This data assay is related to the project Phenotypic screen of Golgi apparatus by Automation of FIB/SEM.
Images are from Hela Kyoto cells tagged with GFP on GalNAc-T2, fluorescent Golgi apparatus on green channel. DAPI staining of nuclei is on blue channel.
Cells were transfected via spotting on top of MatTek glass bottom 2ml. dishes. MatTek dishes in a concentration of 100.000 cells/ml.
It is divided into 4 sections :
1) Data Loading and preprocessing : Data is loaded using the loadData function and quality controls are applied on the images.
2) Plot images: Random images of the full assay are shown. General features are plotted.
3) Select cells based on features, PCA or tSNE: Selection of images (Which images where selected to be acquired by EM?) with corresponding names and a folder with individual crops. This can be done by plotting a feature or by other clustering technique like PCA or tSNE
Data for each experiment is saved as the data followed by the suffix “_automation”. For each experiment data is stored in two separate folders, one containing all the images of the screening from the selected cells, and other with suffix “--cp” containing all the images from the pre-scanning of the spots with additional files containing comma separated values with information about the images.
Image features and metadata is extracted using a CellProfiler Pipeline v 2.2 (in the same folder). The CellProfiler pipeline first segmented the cells and then it extracted features from each identified Golgi region in the green channel. Cell position was identified from the nucleus associated to each cell.
loadData searches for a folder with "--cp" extension and inside searches for the files generated by the pipeline. Information is merged in a unique dataframe that will be used during the full experiment, a table structure where each column represent a feature and each row a cell found in the images.
%load_ext autoreload
%autoreload 2
from validation_pipeline import *
# Folder where the automation data is. The folder should look something like date+_automation
# - It will merge together all the data tables from Cell Profiler. Be sure that your output folder has the name --cp
# - It will also generate crops of every cell ready for the plots.
df = loadData(regenerate_crops = True, no_treatments=False)
import pandas as pd
import numpy as np
from validation_pipeline import plotQC, applyQC
from bokeh.io import output_notebook
output_notebook()
## START HERE IF YOU WANT TO REPEAT THE CONTROLS
df_qc = df.copy()
df.head()
import os
from ipywidgets import widgets, HBox,Label,Button
%gui qt
get_ipython().enable_gui("qt")
bs = Button( description='Show selected',tooltip='Show in viewer',)
dfolder = os.getcwd() # os.getcwd()+'\\4927--cp\\images\\'
Values are between -2.2 to 100. A common value is -2.2. Values below -2.2 are removed.
mplot, selection_stream = plotQC(df_qc, "ImageQuality_PowerLogLogSlope_Dna","Focus_Quality", size = 8)
def openVisor(bval):
images_list = [ df_qc.at[i,'img_name_raw'] for i in selection_stream.index]
images_values = [ str(df_qc.at[i,'ImageQuality_PowerLogLogSlope_Dna']) for i in selection_stream.index]
if len(images_list)>0:
%run -i visorOk.py --listImages $images_list --folder $dfolder --listIndex $selection_stream.index --listValues $images_values
bs.on_click(openVisor)
display(widgets.HBox([bs]))
mplot
### WRITE HERE
from ipywidgets import widgets, HBox,Label
### TYPE DOWN HERE YOUR RESULT for detecting artifacts or apoptosis:
# displaying the text widget
qc_focus_quality = widgets.FloatText(width=300)
hb = HBox([Label('Insert here value for Focus quality:'),qc_focus_quality])
display(hb)
def handle_submit_1(sender):
print("You selected the value :"+str(qc_focus_quality.value))
return
qc_focus_quality.observe(handle_submit_1,'value')
df_qc = applyQC(df_qc, "ImageQuality_PowerLogLogSlope_Dna",vmin= qc_focus_quality.value)
Mean Nuclei CV – Coefficient of variation of the intensity. Usually is around 0.4 as maximum. Cells that have more than value are removed.
mplot, selection_stream = plotQC(df_qc, "Mean_Nuclei_Math_CV","Nuclei_Variation")
display(widgets.HBox([bs]))
mplot
### WRITE HERE
from ipywidgets import widgets, HBox,Label
### TYPE DOWN HERE YOUR RESULT for detecting artifacts or apoptosis:
# displaying the text widget
qc_apo = widgets.FloatText(width=300)
hb = HBox([Label('Insert here value to remove apoptotic shapes:'),qc_apo])
display(hb)
def handle_submit_2(sender):
print("You selected the value :"+str(qc_apo.value))
return
qc_apo.observe(handle_submit_2,'value')
if qc_apo.value:
df_qc = applyQC(df_qc, "Mean_Nuclei_Math_CV",vmax= float(qc_apo.value))
Potential Mitotic cells- Integrated Intensity of Golgi background. Images which correlation values with the background are very close are removed. Range allowed was anything bigger than 30.
plotqc,selection_stream = plotQC(df_qc, "Intensity_IntegratedIntensity_GolgiBGCorr","Background_intensity")
display(widgets.HBox([bs]))
plotqc
### TYPE DOWN HERE YOUR RESULT for remove low background cells:
### WRITE HERE
from ipywidgets import widgets, HBox,Label
qc_bg = widgets.FloatText(width=300)
hb = HBox([Label('Insert here value for Mitotic cells:'),qc_bg])
display(hb)
def handle_submit(sender):
print("You selected the value :"+str(qc_bg.value))
return
qc_bg.observe(handle_submit,'value')
# Allowed range: 30 .. 1e+09 and not NA.
df_qc = applyQC(df_qc, "Intensity_IntegratedIntensity_GolgiBGCorr",vmin= qc_bg.value, vmax = 2000)
# CHECKPOINT 1: Execute this if you want to start from this point again.
df = df_qc
print(len(df))
from ipywidgets import Button, HBox, VBox
# First provide a dictionary with the names of the genes. It will be easy to identify by name than by number
genes = dict({'ACTR3': [10, 17],
'ARHGAP44': [15, 25],
'AURKB': [31],
'C1S': [4, 19],
'COPB1': [24],
'COPB2': [0],
'COPG1': [3],
'DENND4C': [5, 30],
'DNM1': [2, 12],
'FAM177B': [9, 23],
'GPT': [27, 29],
'IPO8': [6, 20],
'KIF11': [28],
'NT5C': [13, 22],
'Neg9': [8, 16, 26],
'PTBP1': [11, 14],
'SRSF1': [7, 18],
'WDR75': [1, 21]})
list_genes = []
def on_button_clicked(b):
print("You selected: "+b.description)
list_genes.append(b.description)
words = genes.keys()
items = [Button(description=w, tooltip=w) for w in words]
for b in items:
b.on_click(on_button_clicked)
leftb = VBox(items[0:int(len(items)/2)])
rightb = VBox(items[int(len(items)/2):len(items)])
print('Select controls.')
HBox([leftb,rightb])
%reload_ext autoreload
%autoreload 2
import cv2
import re
from validation_pipeline import findImagesGene
from matplotlib import pyplot as plt
import os
def plot_treatment(genes,list_genes):
positive_controls = list_genes
# For this we have to provide the folder where the data of the prescan is.
folder_data = os.getcwd()
loi = findImagesGene(df,folder_data,genes,positive_controls)
for gene in loi.keys():
m_files = loi[gene]
fig = plt.figure(figsize=(20,25))
total = int(len(m_files)/2)+1
fig.suptitle(gene)
for i,mfile in enumerate(m_files):
img = cv2.imread(mfile)
if img is not None:
b,g,r = cv2.split(img) # get b,g,r
rgb_img = cv2.merge([r,g,b])
ax = fig.add_subplot(total, 2, i + 1, xticks=[], yticks=[])
ax.imshow(np.squeeze(rgb_img))
plot_treatment(genes,list_genes)
There is a total of 4 features : tubularity, diffusivity, fragmentation and condensation. The idea is that phenotypes show always one extreme difference on one of the features or a mixture of two of them, enough to be differentiated between phenotypes.
From the original data coming from a CellProfiler Pipeline, Blobness corresponds to tubularity, Diffusseness to diffusivity, Children_Golgi_Count to Fragmentation and Mean_MaxAreaGolgi_AreaShape_FormFactor to condensation.
# RENAME of Features and data conditioning needed is done here
# This prepares the data for feature inspection with nicer names
df.loc[:,'Children_Golgi_Count']=df.loc[:,'Children_Golgi_Count'].astype(int)
# Conversion needed because by default is read as an string.
from validation_pipeline import getZscoreAgainstControl
# STEP 1. NORMALIZATION PER GENE against NEGATIVE CONTROL
df_features_gene = df[['Math_Blobness', 'Math_Diffuseness', 'Children_Golgi_Count',
'Mean_MaxAreaGolgi_AreaShape_FormFactor','Gene','Mean_Nuclei_AreaShape_Solidity']]
df_features_gene_norm = getZscoreAgainstControl(df_features_gene,hue='Gene',control=['Neg9'],remove_outliers=False, drop_index = False)
df_features_gene_norm = df_features_gene_norm[['index','Math_Blobness_zscore', 'Math_Diffuseness_zscore', 'Children_Golgi_Count_zscore',
'Mean_MaxAreaGolgi_AreaShape_FormFactor_zscore','Mean_Nuclei_AreaShape_Solidity_zscore','Gene']]
df_features_gene_norm.columns = ['index',"Tubular","Diffuse","Fragmented","Condensed","Nuc_Solidity","Gene"]
print(len(df_features_gene_norm))
df_features_gene_norm.head(10)
from ipywidgets import Button, HBox, VBox
list_genes = []
def on_button_clicked(b):
print("You selected the gene: "+b.description)
list_genes.append(b.description)
words = genes.keys()
items = [Button(description=w, tooltip=w) for w in words]
for b in items:
b.on_click(on_button_clicked)
leftb = VBox(items[0:int(len(items)/2)])
rightb = VBox(items[int(len(items)/2):len(items)])
HBox([widgets.Label(value="Select Treatments to plot: "),leftb,rightb])
import pylab as Plot
import pandas as pd
import matplotlib.patches as mpatches
import seaborn as sns
from validation_pipeline import nlcmap
def strip_plot_features(iXz, iXz_control,isize=20,columns=["Tubular","Diffuse","Fragmented","Condensed","Nuc_Solidity"],tags=['Phenotype','Control']):
# sns.set_style("whitegrid")
Xz = iXz[columns]
if iXz_control is not None:
Xz_control = iXz_control[columns]
Xz_control = pd.melt(Xz_control)
Xz = pd.melt(Xz)
Xz.loc[:,'Type'] = tags[0]
Xz_control.loc[:,'Type']=tags[1]
my_df_t = pd.concat([Xz,Xz_control])
ax = sns.stripplot(x='variable',y='value',hue='Type',edgecolor='black',linewidth=1,data=my_df_t,palette = ["#e74c3c","#3498db"],dodge=True, jitter=0.25, alpha=0.25, size=isize)
ax.axes.set_xlim(-1, 5)
classes = tags
class_colours = ["#e74c3c","#3498db"]
recs = []
for i in range(0, len(class_colours)):
recs.append(mpatches.Rectangle((0, 0), 1, 1, fc=class_colours[i]))
Plot.legend(recs, classes, loc=4)
return ax
else:
ax = sns.stripplot(data=Xz, jitter=True, alpha=0.25, size=isize)
ax = sns.violinplot(data=Xz, inner = None, color = ".9")
# sns.despine()
# distance across the "X" or "Y" stripplot column to span, in this case 40%
median_width = 0.4
for tick, text in zip(ax.get_xticks(), ax.get_xticklabels()):
sample_name = text.get_text() # "X" or "Y"
median_val = Xz[sample_name].median()
# plot horizontal lines across the column, centered on the tick
ax.plot([tick - median_width / 2, tick + median_width / 2], [median_val, median_val],
lw=4, color='k')
#Plot.show()
#Plot.close()
return ax
def getFeatures(idf_features,gene):
features_pheno = idf_features.query('Gene==\''+gene+'\'').copy()
features_pheno = features_pheno[["Tubular","Diffuse","Fragmented","Condensed","Nuc_Solidity"]]
return features_pheno
features_c = getFeatures(df_features_gene_norm,'Neg9')
Plot.rcParams['figure.figsize'] = (15, 8)
fig, ax = plt.subplots(len(list_genes), 0)
for ind,gene in enumerate(list_genes):
features_p = getFeatures(df_features_gene_norm,gene)
if len(list_genes)>1:
ax[ind]= strip_plot_features(features_p, features_c,isize=5,columns=["Tubular","Diffuse","Fragmented","Condensed","Nuc_Solidity"],tags=[gene,'CONTROL'])
else:
ax= strip_plot_features(features_p, features_c,isize=5,columns=["Tubular","Diffuse","Fragmented","Condensed","Nuc_Solidity"],tags=[gene,'CONTROL'])
Plot.show()
df = df.reset_index(drop=False)
res = df_features_gene_norm.join(df,on='index',rsuffix='_right')
res= res.drop(columns=['index','index_right', 'Gene_right'])
print("Total cells so far:"+str(len(res)))
res.head()
#CHECKPOINT 2 : Execute this if you want to restart from here.
final = res.copy()
from ipywidgets import Button, HBox, VBox
list_genes = []
def on_button_clicked(b):
print("You selected the gene: "+b.description)
list_genes.append(b.description)
words = genes.keys()
items = [Button(description=w, tooltip=w) for w in words]
for b in items:
b.on_click(on_button_clicked)
leftb = VBox(items[0:int(len(items)/2)])
rightb = VBox(items[int(len(items)/2):len(items)])
HBox([widgets.Label(value="Select Genes to delete: "),leftb,rightb])
list_ind = []
for el in list_genes:
final = final[final.Gene!=el]
print("Deleted treatment :"+el)
final = final.reset_index(drop=True) #### VERY IMPORTANT
final.head(10)
# Some functions needed for later
def save_images(df):
# For this we have to provide the folder where the data of the prescan is.
os.mkdir(".\\selected")
data_folder = os.getcwd()
images_dir = data_folder
df = df.reset_index(drop=True)
total = int(len(df)/2)+1
fig = plt.figure(figsize=(30,total*10))
k = 0
for i in range(len(df)):
mfile = images_dir+'\\'+df.at[i,"img_name_raw"]
name_or = df.at[i,"Metadata_BaseFileName"]
name = str(i)+"_"+str(df.at[i,"Gene"]+"_"+name_or)
img = cv2.imread(mfile)
b,g,r = cv2.split(img) # get b,g,r
rgb_img = cv2.merge([r,g,b])
if img is not None:
ax = fig.add_subplot(total, 2, k + 1, xticks=[], yticks=[])
rgb_img2 = cv2.merge([b,g,r])
ax.imshow(np.squeeze(rgb_img))
cv2.imwrite(".\\selected\\"+name+".tif",np.squeeze(rgb_img2))
k = k+1
### Parameters to plot
cell_indexes_final = []
size_dot = 5
separation_dots = 0.4 # jitter
cells_random = 20 # cells to select randomly from the button add random
def openVisor(bval):
images_list = [ final_tmp.at[final_tmp.index[i],'img_name_raw'] for i in selection_stream.index]
images_values = [ str(final_tmp.at[final_tmp.index[i],feature]) for i in selection_stream.index]
if len(images_list)>0:
%run -i visorOk.py --listImages $images_list --folder $dfolder --listIndex $selection_stream.index --listValues $images_values
def save_ind(bval):
for el in selection_stream.index:
indexes.add(el)
print("Indices saved : "+str(indexes))
return
def save_ind_randomly(bval):
cells_range = np.min([len(selection_stream.index),cells_random])
for el in range(cells_range):
index_random = random.choice(selection_stream.index)
indexes.add(index_random)
print("Indices saved : "+str(indexes))
return
def clear_indexes(bval):
global indexes
indexes = set()
print("All indexes cleared.")
return
b = widgets.Button(
description='Save indexes',
disabled=False,
tooltip='Selection is saved',
)
b2 = widgets.Button(
description='Show selected',
disabled=False,
tooltip='Show in viewer',
)
b3 = widgets.Button(
description='Add random',
disabled=False,
tooltip='Add randomly '+str(cells_random)+' cells from selected region',
)
b4 = widgets.Button(
description='Clear all indexes',
disabled=False,
tooltip='Clear all selections',
)
b.on_click(save_ind)
b2.on_click(openVisor)
b3.on_click(save_ind_randomly)
b4.on_click(clear_indexes)
from ipywidgets import Button, HBox, VBox, ToggleButton, Layout
def show_selected_cells(show_all_selected = False):
sel_cells = 3
items = []
items2 = []
boxes = []
i = 1
for index in indexes:
real_ind = final_tmp.index[index]
name = final_tmp.at[real_ind,'Gene']
file = open(final_tmp.at[real_ind,'img_name_raw'], "rb")
image = file.read()
items.append(ToggleButton(value = show_all_selected, icon='check', description=str(name+"_"+str(real_ind)), tooltip=str(name),layout=Layout(width='300px', height='50px')))
items2.append(widgets.Image(value=image,format='png',layout=Layout(width='300px', height='300px')))
if i%sel_cells == 0:
boxes.append(VBox([HBox(items),HBox(items2)]))
items = []
items2 =[]
i = 0
i = i+1
return display(VBox(boxes)), boxes
def save_selected_cells(boxes):
print("You selected: ")
to_save = {}
for vb in boxes:
for bSel in vb.children:
for m_button in bSel.children :
if m_button.value == True :
value = m_button.description
data= value.split('_')
treatment = data[0]
index = int(data[1])
if treatment not in to_save.keys():
to_save[treatment] = []
print(treatment+" "+str(index))
to_save[treatment].append(index)
return to_save
COPB1, COPB2 and COPG1
import os
%gui qt
title = "Diffusion"
feature ="Diffuse"
indexes = set()
final_tmp = final[final['Gene'].isin(['COPG1','COPB2','COPB1'])]
mp, selection_stream = plotQC(final_tmp, feature, title, size= size_dot,jitter = separation_dots, factor_reduce=0.8)
display(widgets.HBox([b, b2, b3, b4]))
mp
d, boxes = show_selected_cells()
d
to_save = save_selected_cells(boxes)
cell_indexes_final = to_save.copy()
cell_indexes_final.update(to_save)
DNM1, DENND4C, SRSF1
%gui qt
title = "Fragmented"
feature ="Fragmented"
indexes = set()
final_tmp = final[final['Gene'].isin(['DNM1','DENND4C','SRSF1'])]
mp, selection_stream = plotQC(final_tmp, feature, title, size= size_dot,jitter = separation_dots, factor_reduce=0.8)
display(widgets.HBox([b, b2, b3, b4]))
mp
d, boxes = show_selected_cells()
d
to_save = save_selected_cells(boxes)
cell_indexes_final.update(to_save)
cell_indexes_final
cell_indexes_final
ARHGAP44, FAM177B, IPO8, C1S
%gui qt
title = "Tubular"
feature ="Tubular"
indexes = set()
final_tmp = final[final['Gene'].isin(['ARHGAP44','FAM177B','IPO8','C1S'])]
mp, selection_stream = plotQC(final_tmp, feature, title, size= size_dot,jitter = separation_dots, factor_reduce=0.8)
display(widgets.HBox([b, b2, b3, b4]))
mp
d, boxes = show_selected_cells()
d
to_save = save_selected_cells(boxes)
cell_indexes_final.update(to_save)
cell_indexes_final
WDR75, PTBP1, ACTR3, NT5C, GPT and don't forget some negative controls (Neg 9)(Select all of them and apply Add random)
%gui qt
title = "Condensed"
feature ="Condensed"
indexes = set()
final_tmp = final[final['Gene'].isin(['WDR75', 'PTBP1', 'ACTR3', 'NT5C', 'GPT','Neg9'])]
mp, selection_stream = plotQC(final_tmp, feature, title, size= size_dot,jitter = separation_dots, factor_reduce=0.8)
display(widgets.HBox([b, b2, b3, b4]))
mp
d, boxes = show_selected_cells()
d
to_save = save_selected_cells(boxes)
cell_indexes_final.update(to_save)
cell_indexes_final
inds = []
for key, value in cell_indexes_final.items():
inds = inds+value
final_stage = final.loc[inds,:]
from scipy.spatial import KDTree
def listOfClosest(list_sem, radius = 100):
"""
Gives back the indices of elements to be removed
Notice that this method requires some sophistication.
In this approach we just leave the first element we find.
:param m_list_px:
:param m_list_py:
:param radius:
:return:
"""
cp_list = list_sem.copy()
# for the tree
ncp_list = list_sem.copy()
ncp_list_x = list(ncp_list['Location_Center_X'])
ncp_list_y = list(ncp_list['Location_Center_Y'])
tree_list = list(zip(ncp_list_x, ncp_list_y))
list_excluded = []
for mindex,row in cp_list.iterrows():
el = np.array((row['Location_Center_X'],row['Location_Center_Y']),dtype=np.float32)
distances_p, indexes = KDTree(tree_list).query(el,k=2) # closest element that is not yourself
if( distances_p[1] < radius):
list_excluded.append(mindex)
ncp_list = ncp_list.drop(df.index[[mindex]])
ncp_list_x = ncp_list['Location_Center_X']
ncp_list_y = ncp_list['Location_Center_Y']
tree_list = list(zip(ncp_list_x, ncp_list_y))
if(len(tree_list)<2):
break
return list_excluded
df_points = final_stage[['Location_Center_X','Location_Center_Y']]
list_ex = listOfClosest(df_points)
len(list_ex)
from ipywidgets import Button, HBox, VBox, ToggleButton, Layout
def show_selected_cells_excluded(mindexes):
sel_cells = 3
items = []
items2 = []
boxes = []
i = 1
for index in range(len(final_stage)):
real_ind = final_stage.index[index]
name = final_stage.at[real_ind,'Gene']
file = open(final_stage.at[real_ind,'img_name_raw'], "rb")
image = file.read()
if real_ind in mindexes:
items.append(ToggleButton(value = False, button_style='warning',icon='check', description=str(name+"_"+str(real_ind)), tooltip=str(name),layout=Layout(width='300px', height='50px')))
else:
items.append(ToggleButton(value = True, button_style='success',icon='check', description=str(name+"_"+str(real_ind)), tooltip=str(name),layout=Layout(width='300px', height='50px')))
items2.append(widgets.Image(value=image,format='png',layout=Layout(width='300px', height='300px')))
if i%sel_cells == 0:
boxes.append(VBox([HBox(items),HBox(items2)]))
items = []
items2 =[]
i = 0
i = i+1
return display(VBox(boxes)), boxes
d, boxes = show_selected_cells_excluded(list_ex)
d
final_saving = save_selected_cells(boxes)
inds = []
for key, value in final_saving.items():
inds = inds+value
final_df = final.loc[inds,:]
len(final_df)
final_df
from IPython.core.display import HTML
display(HTML(final_df.to_html()))
#hv.Div(final.to_html())
final_df.to_csv('selected_cells.csv')
save_images(final_df)